{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "import nltk\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of lines: 50000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9999_0</td>\n",
       "      <td>Watching Time Chasers, it obvious that it was ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>45057_0</td>\n",
       "      <td>I saw this film about 20 years ago and remembe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15561_0</td>\n",
       "      <td>Minor Spoilers&lt;br /&gt;&lt;br /&gt;In New York, Joan Ba...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7161_0</td>\n",
       "      <td>I went to see this film with a great deal of e...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>43971_0</td>\n",
       "      <td>Yes, I agree with everyone on this site this m...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id                                             review\n",
       "0   9999_0  Watching Time Chasers, it obvious that it was ...\n",
       "1  45057_0  I saw this film about 20 years ago and remembe...\n",
       "2  15561_0  Minor Spoilers<br /><br />In New York, Joan Ba...\n",
       "3   7161_0  I went to see this film with a great deal of e...\n",
       "4  43971_0  Yes, I agree with everyone on this site this m..."
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读入数据\n",
    "def load_dataset(name,nrows=None):\n",
    "    datasets = {\n",
    "        'unlabeled_train':'unlabeledTrainData.tsv',\n",
    "        'labeled_train':'labeledTrainData.tsv',\n",
    "        'test':'testData.tsv'\n",
    "    }\n",
    "    if name not in datasets:\n",
    "        raise ValueError(name)\n",
    "    datafile = os.path.join('.','datas',datasets[name])\n",
    "    df = pd.read_csv(datafile,sep=\"\\t\",escapechar=\"\\\\\",nrows=nrows)\n",
    "    return df\n",
    "df = load_dataset('unlabeled_train',50000)\n",
    "print(\"Number of lines: {}\".format(len(df)))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,"
     ]
    }
   ],
   "source": [
    "# 数据清洗\n",
    "def clean_sentence(sentence):\n",
    "    sentence = sentence.lower() #0,归一化处理，全部转为小写\n",
    "    sentence = BeautifulSoup(sentence,'html.parser').get_text()#1，去掉html标签\n",
    "    sentence = re.sub('[\\W]',' ',sentence)#2，移除标点\n",
    "    tokens = nltk.word_tokenize(sentence)#3，切分词/token\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    sentence_words = [i for i in sentence.split() if i not in stop_words]#4,去掉停用词\n",
    "    return ' '.join(sentence_words)#5，重组为新的句子\n",
    "\n",
    "# count = 0 # 打印进度\n",
    "def split_sentences(review):\n",
    "#     global count\n",
    "#     count += 1\n",
    "#     if count % 100 == 0:\n",
    "#          print(count//100,end=\",\")\n",
    "    sentence = clean_sentence(review)\n",
    "    sentences_words = sentence.split()\n",
    "    return sentences_words\n",
    "sentences = [x for x in df.review.apply(split_sentences)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50000\n",
      "[['watching', 'time', 'chasers', 'obvious', 'made', 'bunch', 'friends', 'maybe', 'sitting', 'around', 'one', 'day', 'film', 'school', 'said', 'hey', 'let', 'pool', 'money', 'together', 'make', 'really', 'bad', 'movie', 'something', 'like', 'ever', 'said', 'still', 'ended', 'making', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'barrel', 'stock', 'music', 'etc', 'corners', 'cut', 'except', 'one', 'would', 'prevented', 'film', 'release', 'life', 'like']]\n"
     ]
    }
   ],
   "source": [
    "print(len(sentences))\n",
    "print(sentences[:1])# sentences 应该是一个二维数组！！！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training model...\n"
     ]
    }
   ],
   "source": [
    "\"\"\"使用gensim训练词嵌入模型\"\"\"\n",
    "# 设定词向量训练的参数\n",
    "num_features = 300# word vector dimensionality ——建议300-500维度 \n",
    "min_word_count = 40# minimum word count  推荐40\n",
    "num_workers = 4#number of threads to run in parallel\n",
    "context = 10# context window size\n",
    "downsampling = 1e-3 # downsample setting for frequnet words\n",
    "model_name = '{}featurs_{}minwords_{}context.model'.format(num_features,min_word_count,context)\n",
    "\n",
    "print('training model...')\n",
    "model = Word2Vec(sentences,workers=num_workers,size=num_features,min_count = min_word_count,window=context,sample=downsampling)\n",
    "model.init_sims(replace=True)\n",
    "model.save(os.path.join('.','models',model_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat\n",
      "0.5969352\n",
      "[('lady', 0.6617084741592407), ('conchita', 0.6416019797325134), ('prostitute', 0.6292552947998047), ('husband', 0.6276531219482422), ('lover', 0.6242899894714355), ('sexually', 0.6181201934814453), ('man', 0.5969352126121521), ('women', 0.5967299938201904), ('whore', 0.5816379189491272), ('pregnant', 0.5698956847190857)]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\24544\\.conda\\envs\\chatbot\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `doesnt_match` (Method will be removed in 4.0.0, use self.wv.doesnt_match() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "print(model.doesnt_match(\"man woman child cat\".split())) # 找出不匹配的词语\n",
    "print(model.wv.similarity(\"woman\",\"man\")) # 打印两个词的相关性\n",
    "print(model.wv.most_similar(\"woman\")) # 关联度最高的词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删除不需要的变量\n",
    "del model\n",
    "del sentences\n",
    "del df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "接下来是使用之前保存好的模型，结合labeledData来做训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"在word2vec上训练情感分析模型\"\"\"\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# 读入之前训练好的word2vec模型\n",
    "model_name = model_name\n",
    "model = Word2Vec.load(os.path.join('.','models',model_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       id  sentiment                                             review\n",
      "0  5814_8          1  With all this stuff going down at the moment w...\n",
      "1  2381_9          1  \"The Classic War of the Worlds\" by Timothy Hin...\n",
      "2  7759_3          0  The film starts with a manager (Nicholas Bell)...\n",
      "3  3630_4          0  It must be assumed that those who praised this...\n",
      "4  9495_8          1  Superbly trashy and wondrously unpretentious 8...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\24544\\.conda\\envs\\chatbot\\lib\\site-packages\\ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  import sys\n",
      "c:\\users\\24544\\.conda\\envs\\chatbot\\lib\\site-packages\\ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  import sys\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>290</th>\n",
       "      <th>291</th>\n",
       "      <th>292</th>\n",
       "      <th>293</th>\n",
       "      <th>294</th>\n",
       "      <th>295</th>\n",
       "      <th>296</th>\n",
       "      <th>297</th>\n",
       "      <th>298</th>\n",
       "      <th>299</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.001585</td>\n",
       "      <td>-0.000567</td>\n",
       "      <td>0.032747</td>\n",
       "      <td>-0.003920</td>\n",
       "      <td>-0.023202</td>\n",
       "      <td>0.015403</td>\n",
       "      <td>-0.008614</td>\n",
       "      <td>-0.010279</td>\n",
       "      <td>0.012291</td>\n",
       "      <td>-0.003230</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.012411</td>\n",
       "      <td>0.007878</td>\n",
       "      <td>0.003054</td>\n",
       "      <td>-0.002946</td>\n",
       "      <td>-0.008337</td>\n",
       "      <td>-0.005934</td>\n",
       "      <td>-0.001328</td>\n",
       "      <td>0.012154</td>\n",
       "      <td>-0.004700</td>\n",
       "      <td>-0.017998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.011214</td>\n",
       "      <td>-0.010502</td>\n",
       "      <td>-0.001437</td>\n",
       "      <td>-0.005376</td>\n",
       "      <td>-0.031425</td>\n",
       "      <td>-0.003963</td>\n",
       "      <td>0.003215</td>\n",
       "      <td>0.006335</td>\n",
       "      <td>0.020740</td>\n",
       "      <td>-0.013269</td>\n",
       "      <td>...</td>\n",
       "      <td>0.011174</td>\n",
       "      <td>0.002982</td>\n",
       "      <td>0.000119</td>\n",
       "      <td>-0.004681</td>\n",
       "      <td>0.006098</td>\n",
       "      <td>-0.012677</td>\n",
       "      <td>0.015745</td>\n",
       "      <td>0.001920</td>\n",
       "      <td>-0.005046</td>\n",
       "      <td>-0.021951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.004807</td>\n",
       "      <td>-0.010583</td>\n",
       "      <td>-0.024568</td>\n",
       "      <td>0.001196</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>-0.008517</td>\n",
       "      <td>-0.000997</td>\n",
       "      <td>-0.002218</td>\n",
       "      <td>-0.013086</td>\n",
       "      <td>-0.005278</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.000991</td>\n",
       "      <td>-0.017428</td>\n",
       "      <td>-0.000680</td>\n",
       "      <td>-0.020089</td>\n",
       "      <td>0.029553</td>\n",
       "      <td>-0.004461</td>\n",
       "      <td>-0.004151</td>\n",
       "      <td>-0.015569</td>\n",
       "      <td>-0.009800</td>\n",
       "      <td>0.008643</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000204</td>\n",
       "      <td>-0.011484</td>\n",
       "      <td>-0.006328</td>\n",
       "      <td>-0.019119</td>\n",
       "      <td>-0.010072</td>\n",
       "      <td>-0.006927</td>\n",
       "      <td>-0.002075</td>\n",
       "      <td>0.008648</td>\n",
       "      <td>0.003871</td>\n",
       "      <td>-0.016238</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.015881</td>\n",
       "      <td>-0.001592</td>\n",
       "      <td>-0.008767</td>\n",
       "      <td>-0.013480</td>\n",
       "      <td>0.005728</td>\n",
       "      <td>0.015701</td>\n",
       "      <td>0.009930</td>\n",
       "      <td>0.018884</td>\n",
       "      <td>0.002761</td>\n",
       "      <td>-0.014817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.004331</td>\n",
       "      <td>-0.002867</td>\n",
       "      <td>-0.007977</td>\n",
       "      <td>-0.006313</td>\n",
       "      <td>0.001415</td>\n",
       "      <td>0.008218</td>\n",
       "      <td>-0.006616</td>\n",
       "      <td>-0.014684</td>\n",
       "      <td>-0.000503</td>\n",
       "      <td>0.008105</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.012473</td>\n",
       "      <td>-0.014938</td>\n",
       "      <td>0.001253</td>\n",
       "      <td>-0.015154</td>\n",
       "      <td>0.009919</td>\n",
       "      <td>0.001044</td>\n",
       "      <td>-0.008464</td>\n",
       "      <td>0.003701</td>\n",
       "      <td>-0.008495</td>\n",
       "      <td>0.000546</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 300 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        0         1         2         3         4         5         6    \\\n",
       "0  0.001585 -0.000567  0.032747 -0.003920 -0.023202  0.015403 -0.008614   \n",
       "1 -0.011214 -0.010502 -0.001437 -0.005376 -0.031425 -0.003963  0.003215   \n",
       "2 -0.004807 -0.010583 -0.024568  0.001196  0.029349 -0.008517 -0.000997   \n",
       "3  0.000204 -0.011484 -0.006328 -0.019119 -0.010072 -0.006927 -0.002075   \n",
       "4 -0.004331 -0.002867 -0.007977 -0.006313  0.001415  0.008218 -0.006616   \n",
       "\n",
       "        7         8         9    ...       290       291       292       293  \\\n",
       "0 -0.010279  0.012291 -0.003230  ... -0.012411  0.007878  0.003054 -0.002946   \n",
       "1  0.006335  0.020740 -0.013269  ...  0.011174  0.002982  0.000119 -0.004681   \n",
       "2 -0.002218 -0.013086 -0.005278  ... -0.000991 -0.017428 -0.000680 -0.020089   \n",
       "3  0.008648  0.003871 -0.016238  ... -0.015881 -0.001592 -0.008767 -0.013480   \n",
       "4 -0.014684 -0.000503  0.008105  ... -0.012473 -0.014938  0.001253 -0.015154   \n",
       "\n",
       "        294       295       296       297       298       299  \n",
       "0 -0.008337 -0.005934 -0.001328  0.012154 -0.004700 -0.017998  \n",
       "1  0.006098 -0.012677  0.015745  0.001920 -0.005046 -0.021951  \n",
       "2  0.029553 -0.004461 -0.004151 -0.015569 -0.009800  0.008643  \n",
       "3  0.005728  0.015701  0.009930  0.018884  0.002761 -0.014817  \n",
       "4  0.009919  0.001044 -0.008464  0.003701 -0.008495  0.000546  \n",
       "\n",
       "[5 rows x 300 columns]"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = load_dataset('labeled_train')\n",
    "print(df.head())\n",
    "\n",
    "\n",
    "def to_review_vector(review):\n",
    "    words = clean_sentence(review).split()\n",
    "    array = np.array([model[w] for w in words if w in model])\n",
    "    return pd.Series(array.mean(axis=0))\n",
    "\n",
    "train_data_features = df.review.apply(to_review_vector)\n",
    "train_data_features.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "mean()函数功能：求取均值\n",
    "mean() 函数定义：numpy.mean(a, axis, dtype, out，keepdims )\n",
    "经常操作的参数为axis，以m * n矩阵举例\n",
    "\n",
    "axis 不设置值，对 m*n 个数求均值，返回一个实数\n",
    "axis = 0：压缩行，对各列求均值，返回 1* n 矩阵 1行n列[x,x,x,x...]\n",
    "axis =1 ：压缩列，对各行求均值，返回 m *1 矩阵 n行1列[y,y,y,y...]的转置\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"用随机森林构建分类器\"\"\"\n",
    "forest = RandomForestClassifier(n_estimators=100,random_state=42) # 构建100棵树\n",
    "forest = forest.fit(train_data_features,df.sentiment)#训练词向量（词向量的特征值）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[12500,     0],\n",
       "       [    0, 12500]], dtype=int64)"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 在训练集上试试，确保模型正常work\n",
    "confusion_matrix(df.sentiment,forest.predict(train_data_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删除内存占用\n",
    "del df\n",
    "del train_data_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         id                                             review\n",
      "0  12311_10  Naturally in a film who's main themes are of m...\n",
      "1    8348_2  This movie is a disaster within a disaster fil...\n",
      "2    5828_4  All in all, this is a movie for kids. We saw i...\n",
      "3    7186_2  Afraid of the Dark left me with the impression...\n",
      "4   12128_7  A very accurate depiction of small time mob li...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\24544\\.conda\\envs\\chatbot\\lib\\site-packages\\ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  import sys\n",
      "c:\\users\\24544\\.conda\\envs\\chatbot\\lib\\site-packages\\ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  import sys\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12311_10</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8348_2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5828_4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7186_2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12128_7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id  sentiment\n",
       "0  12311_10          1\n",
       "1    8348_2          0\n",
       "2    5828_4          0\n",
       "3    7186_2          0\n",
       "4   12128_7          1"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"预测测试集\"\"\"\n",
    "df = load_dataset('test')\n",
    "print(df.head())\n",
    "\n",
    "test_data_features = df.review.apply(to_review_vector)\n",
    "result = forest.predict(test_data_features)\n",
    "output = pd.DataFrame({'id':df.id,'sentiment':result})\n",
    "output.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
